# Regress Sales on every other column of the advertising data, then report
# the fitted value and its confidence interval for a single new budget mix.
ad <- read.csv("Advertising_Updated.csv")
lm <- lm(Sales ~ ., data = ad)
new.dat <- data.frame(TV = 200, Radio = 10, Newspaper = 20)
predict(lm, new.dat, interval = "confidence")
fit lwr upr
1 13.95637 13.60053 14.31221
library(dplyr)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(magrittr)
# Load KAG.csv with text columns kept as character vectors (not factors)
# and preview the first rows.
data <- read.csv("KAG.csv", stringsAsFactors = FALSE)
head(data)
# Per-ad totals of CPC and impressions. Keep ads whose summed CPC is below 1
# and list them by ascending CPC, breaking ties by descending impressions.
data %>%
  group_by(ad_id) %>%
  summarise(cpc = sum(CPC), imp = sum(Impressions)) %>%
  filter(cpc < 1) %>%
  arrange(cpc, desc(imp))
Among the ads with cpc = 0, ad 1121094 led to the most impressions.
# Per-campaign totals of spend and impressions, with spend per impression
# (stored as `cpm`), listed from lowest to highest.
data %>%
  group_by(campaign_id) %>%
  summarise(spent = sum(Spent), imp = sum(Impressions)) %>%
  mutate(cpm = spent / imp) %>%
  arrange(cpm)
Campaign 1178 was the least efficient on brand awareness on average.
# Overall ROAS across the whole data set: total conversions weighted by 5 and
# approved conversions weighted by 50, divided by total spend.
# The summary is a single row, so no sorting step is needed.
data %>%
  summarise(
    spent = sum(Spent),
    tc = 5 * sum(CostPerConv_Total),
    ac = 50 * sum(CostPerConv_Approved)
  ) %>%
  mutate(roas = (tc + ac) / spent)
ROAS is 34.32.
library(ggplot2)
# Row-level ROAS for interests 15, 21 and 101, compared with one boxplot per
# interest.
#
# The ROAS arithmetic is element-wise, so no grouping is needed; leaving it out
# also means `d` is not left as a grouped data frame.
# NOTE(review): the overall ROAS computed earlier weights the two conversion
# columns by 5 and 50, while this formula is unweighted -- confirm which
# definition is intended.
d <- data %>%
  filter(interest %in% c(15, 21, 101)) %>%
  mutate(
    interest = factor(interest),
    roas = (CostPerConv_Total + CostPerConv_Approved) / Spent
  ) %>%
  # Drops NA/NaN ratios (e.g. 0 / 0); infinite ratios are not removed here.
  filter(!is.na(roas)) %>%
  arrange(roas)

ggplot(d, aes(x = interest, y = roas, group = interest)) +
  geom_boxplot() +
  labs(x = "Interest ID", y = "ROAS")
# Mean and median row-level ROAS by gender within campaign 1178.
# NOTE(review): is.na() removes NA/NaN ratios only; if any row has Spent == 0
# with a non-zero numerator the ratio is Inf and the mean becomes Inf --
# confirm whether such rows exist.
data %>%
  filter(campaign_id == 1178) %>%
  mutate(
    roas = (CostPerConv_Total + CostPerConv_Approved) / Spent,
    gender = factor(gender)
  ) %>%
  filter(!is.na(roas)) %>%
  group_by(gender) %>%
  summarise(mn = mean(roas), md = median(roas))
NA
library(readr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mtibble [30m 2.1.3 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mtidyr [30m 1.0.2 [32m✓[30m [34mforcats[30m 0.4.0
[32m✓[30m [34mpurrr [30m 0.3.3 [39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mtidyr[30m::[32mextract()[30m masks [34mmagrittr[30m::extract()
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31mx[30m [34mpurrr[30m::[32mset_names()[30m masks [34mmagrittr[30m::set_names()[39m
library(correlationfunnel)
[38;5;238m══[39m [38;5;238mUsing correlationfunnel?[39m [38;5;238m══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════[39m[38;5;32m
You might also be interested in applied data science training for business.
[39m[38;5;32m</> Learn more at - www.business-science.io </>[39m
library(DataExplorer)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(WVPlots)
library(ggthemes)
library(ROCR)
Loading required package: gplots
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
library(caret)
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(corrplot)
corrplot 0.84 loaded
# Load advertising1.csv, reading text columns as factors, and preview it.
# This replaces the previous contents of `data`.
data <- read.csv("advertising1.csv", stringsAsFactors = TRUE)
head(data)
# One boxplot per continuous variable to inspect spread and outliers.

# Daily time spent on the site
ggplot(data, aes(y = Daily.Time.Spent.on.Site)) +
  geom_boxplot() +
  labs(y = "Daily.Time.Spent.on.Site")

# Daily internet usage
ggplot(data, aes(y = Daily.Internet.Usage)) +
  geom_boxplot() +
  labs(y = "Daily.Internet.Usage")

# Area income
ggplot(data, aes(y = Area.Income)) +
  geom_boxplot() +
  labs(y = "Area.Income")
# Print the full data frame, then draw bars whose heights are the stacked
# Clicked.on.Ad values for each value of Male and of Age.
data

ggplot(data, aes(x = Male, y = Clicked.on.Ad)) +
  geom_col()

ggplot(data, aes(x = Age, y = Clicked.on.Ad)) +
  geom_col()
# Distribution of each continuous variable, split by click outcome
# (Clicked.on.Ad converted to a factor for the x axis).

# Age
ggplot(data, aes(x = factor(Clicked.on.Ad), y = Age)) +
  geom_boxplot() +
  labs(y = "Age")

# Area income
ggplot(data, aes(x = factor(Clicked.on.Ad), y = Area.Income)) +
  geom_boxplot() +
  labs(y = "Area.Income")

# Daily internet usage
ggplot(data, aes(x = factor(Clicked.on.Ad), y = Daily.Internet.Usage)) +
  geom_boxplot() +
  labs(y = "Daily.Internet.Usage")

# Daily time spent on the site
ggplot(data, aes(x = factor(Clicked.on.Ad), y = Daily.Time.Spent.on.Site)) +
  geom_boxplot() +
  labs(y = "Daily.Time.Spent.on.Site")
Based on our preliminary boxplots, I would expect an older person to be more likely to click on the ad than someone younger.
# Age against area income; point shape and colour both encode click outcome.
ggplot(data, aes(x = Age, y = Area.Income)) +
  geom_point(
    aes(shape = factor(Clicked.on.Ad), color = factor(Clicked.on.Ad))
  )
Based on this plot, I would not expect a 32-year-old with an income of $62,000 to click on the ad, because clicks appear to be positively correlated with age and negatively correlated with income. The majority of clicks happen at ages > 32 and incomes < $60,000.
# Age against daily time on site; point shape and colour both encode click
# outcome.
ggplot(data, aes(x = Age, y = Daily.Time.Spent.on.Site)) +
  geom_point(
    aes(shape = factor(Clicked.on.Ad), color = factor(Clicked.on.Ad))
  )
Based on this plot, I would not expect a 50-year-old person who spends 60 minutes daily on the site to click on the ad because there seems to be a negative correlation between time spent on the site and clicks. People that spend more time on the site are less likely to click on the ad.
# Correlation funnel of every feature against the click outcome.
data %>%
  # Coerce all numeric columns (including integer ones) to double.
  mutate_if(is.numeric, as.numeric) %>%
  binarize() %>%
  # Clicked.on.Ad__1 is presumably the indicator column that binarize()
  # creates for Clicked.on.Ad == 1 -- verify against the binarized names.
  correlate(Clicked.on.Ad__1) %>%
  plot_correlation_funnel(interactive = TRUE, alpha = 0.7)
# Logistic regression of click outcome on internet usage, time on site, age
# and area income, followed by the coefficient summary.
model <- glm(
  Clicked.on.Ad ~ Daily.Internet.Usage + Daily.Time.Spent.on.Site +
    Age + Area.Income,
  family = binomial(),
  data = data
)
summary(model)
Call:
glm(formula = Clicked.on.Ad ~ Daily.Internet.Usage + Daily.Time.Spent.on.Site +
Age + Area.Income, family = binomial(), data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4578 -0.1341 -0.0333 0.0167 3.1961
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.713e+01 2.714e+00 9.995 < 2e-16 ***
Daily.Internet.Usage -6.391e-02 6.745e-03 -9.475 < 2e-16 ***
Daily.Time.Spent.on.Site -1.919e-01 2.066e-02 -9.291 < 2e-16 ***
Age 1.709e-01 2.568e-02 6.655 2.83e-11 ***
Area.Income -1.354e-04 1.868e-05 -7.247 4.25e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1386.3 on 999 degrees of freedom
Residual deviance: 182.9 on 995 degrees of freedom
AIC: 192.9
Number of Fisher Scoring iterations: 8
# Confusion matrix for the logistic model, scored on the same `data` that is
# passed to predict() (i.e. not a held-out set).

# Predicted click: fitted probability of at least 0.8.
data$predict <- predict(model, data, type = "response") >= 0.8
# Observed click as a logical.
data$test <- data$Clicked.on.Ad == 1

# Fix both factors to the levels FALSE/TRUE so the table keeps its 2x2 shape
# even if every prediction falls on one side of the threshold, and make
# "TRUE" (a click) the positive class. Without `positive`, the first level
# ("FALSE") is used, so sensitivity/specificity would describe non-clicks.
confusionMatrix(
  factor(data$predict, levels = c(FALSE, TRUE)),
  factor(data$test, levels = c(FALSE, TRUE)),
  positive = "TRUE"
)
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 497 36
TRUE 3 464
Accuracy : 0.961
95% CI : (0.9471, 0.9721)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.922
Mcnemar's Test P-Value : 2.99e-07
Sensitivity : 0.9940
Specificity : 0.9280
Pos Pred Value : 0.9325
Neg Pred Value : 0.9936
Prevalence : 0.5000
Detection Rate : 0.4970
Detection Prevalence : 0.5330
Balanced Accuracy : 0.9610
'Positive' Class : FALSE